Introduction

The data set is a smaller portion of a large dataset with information of number of incidents and injuries in various mine sites across the US. the data includes types of mine, nature of incidents, date and times of the incidents and geographical location of the incidents.

We will start by loading these relevant libraries.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.1
## Warning: package 'stringr' was built under R version 4.1.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(dplyr)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.1.1
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(stringr)
library(DT)
library(sp)
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(tigris)
## To enable 
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
library(doBy)
## 
## Attaching package: 'doBy'
## The following object is masked from 'package:dplyr':
## 
##     order_by
library(usmap)
## Warning: package 'usmap' was built under R version 4.1.1
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

The data will be imported from the “.csv” file saved in the working directory.

my_dataglob <-read.csv('us_data_2000.csv', header=T, sep=",")

Lets check for data column names to identify the variables.

colnames(my_dataglob)
##  [1] "MINE_ID"             "CONTROLLER_ID"       "CONTROLLER_NAME"    
##  [4] "OPERATOR_ID"         "OPERATOR_NAME"       "CONTRACTOR_ID"      
##  [7] "DOCUMENT_NO"         "SUBUNIT_CD"          "SUBUNIT"            
## [10] "ACCIDENT_DT"         "CAL_YR"              "CAL_QTR"            
## [13] "FISCAL_YR"           "FISCAL_QTR"          "ACCIDENT_TIME"      
## [16] "DEGREE_INJURY_CD"    "DEGREE_INJURY"       "FIPS_STATE_CD"      
## [19] "UG_LOCATION_CD"      "UG_LOCATION"         "UG_MINING_METHOD_CD"
## [22] "UG_MINING_METHOD"    "MINING_EQUIP_CD"     "MINING_EQUIP"       
## [25] "EQUIP_MFR_CD"        "EQUIP_MFR_NAME"      "EQUIP_MODEL_NO"     
## [28] "SHIFT_BEGIN_TIME"    "CLASSIFICATION_CD"   "CLASSIFICATION"     
## [31] "ACCIDENT_TYPE_CD"    "ACCIDENT_TYPE"       "NO_INJURIES"        
## [34] "TOT_EXPER"           "MINE_EXPER"          "JOB_EXPER"          
## [37] "OCCUPATION_CD"       "OCCUPATION"          "ACTIVITY_CD"        
## [40] "ACTIVITY"            "INJURY_SOURCE_CD"    "INJURY_SOURCE"      
## [43] "NATURE_INJURY_CD"    "NATURE_INJURY"       "INJ_BODY_PART_CD"   
## [46] "INJ_BODY_PART"       "SCHEDULE_CHARGE"     "DAYS_RESTRICT"      
## [49] "DAYS_LOST"           "TRANS_TERM"          "RETURN_TO_WORK_DT"  
## [52] "IMMED_NOTIFY_CD"     "IMMED_NOTIFY"        "INVEST_BEGIN_DT"    
## [55] "NARRATIVE"           "CLOSED_DOC_NO"       "COAL_METAL_IND"

A quick view of the columns of “my_dataglob” object shows us that it has 57 variables.

Lets use the head function for only the first row.

head(my_dataglob, 1)
##   MINE_ID CONTROLLER_ID CONTROLLER_NAME OPERATOR_ID         OPERATOR_NAME
## 1  100003         41044    Lhoist Group      L13586 Lhoist North America 
##   CONTRACTOR_ID DOCUMENT_NO SUBUNIT_CD                SUBUNIT ACCIDENT_DT
## 1               2.20121e+11          3 STRIP, QUARY, OPEN PIT  14/03/2012
##   CAL_YR CAL_QTR FISCAL_YR FISCAL_QTR ACCIDENT_TIME DEGREE_INJURY_CD
## 1   2012       1      2012          2           945                5
##                   DEGREE_INJURY FIPS_STATE_CD UG_LOCATION_CD    UG_LOCATION
## 1 DAYS RESTRICTED ACTIVITY ONLY             1              ? NO VALUE FOUND
##   UG_MINING_METHOD_CD UG_MINING_METHOD MINING_EQUIP_CD
## 1                   ?   NO VALUE FOUND              24
##                                                         MINING_EQUIP
## 1 Front-end loader, Tractor-shovel, Payloader, Highlift, Skip loader
##   EQUIP_MFR_CD   EQUIP_MFR_NAME EQUIP_MODEL_NO SHIFT_BEGIN_TIME
## 1          119 Not on this list          22321              600
##   CLASSIFICATION_CD  CLASSIFICATION ACCIDENT_TYPE_CD
## 1                12 POWERED HAULAGE               21
##                    ACCIDENT_TYPE NO_INJURIES TOT_EXPER MINE_EXPER JOB_EXPER
## 1 CGHT I, U, B, MVNG & STTN OBJS           1      4.35       4.35      0.67
##   OCCUPATION_CD
## 1           374
##                                                                                              OCCUPATION
## 1 Warehouseman, Bagger, Palletizer/Stacker, Store keeper, Packager, Fabricator, Cleaning plant operator
##   ACTIVITY_CD                    ACTIVITY INJURY_SOURCE_CD
## 1          28 HANDLING SUPPLIES/MATERIALS               76
##             INJURY_SOURCE NATURE_INJURY_CD             NATURE_INJURY
## 1 SURFACE MINING MACHINES              160 CONTUSN,BRUISE,INTAC SKIN
##   INJ_BODY_PART_CD                        INJ_BODY_PART SCHEDULE_CHARGE
## 1              700 MULTIPLE PARTS (MORE THAN ONE MAJOR)               0
##   DAYS_RESTRICT DAYS_LOST TRANS_TERM RETURN_TO_WORK_DT IMMED_NOTIFY_CD
## 1             8         0          N        03/26/2012              ? 
##     IMMED_NOTIFY INVEST_BEGIN_DT
## 1 NO VALUE FOUND                
##                                                                                                                                                                                                NARRATIVE
## 1 Employee was cleaning up at the Primary Crusher with the Dingo skid steer. The employee slipped and fell while operating the skid steer and the machine  pinned him against the cement retaining wall.
##   CLOSED_DOC_NO COAL_METAL_IND
## 1            NA              M

The head function give us quick overview of the first row of data set, values of each column are visible and we can identify the relevant columns to our analysis.

Lets check for data class in each column to make manipulation easier.

sapply(my_dataglob,class)
##             MINE_ID       CONTROLLER_ID     CONTROLLER_NAME         OPERATOR_ID 
##           "integer"         "character"         "character"         "character" 
##       OPERATOR_NAME       CONTRACTOR_ID         DOCUMENT_NO          SUBUNIT_CD 
##         "character"         "character"           "numeric"           "integer" 
##             SUBUNIT         ACCIDENT_DT              CAL_YR             CAL_QTR 
##         "character"         "character"           "integer"           "integer" 
##           FISCAL_YR          FISCAL_QTR       ACCIDENT_TIME    DEGREE_INJURY_CD 
##           "integer"           "integer"           "integer"         "character" 
##       DEGREE_INJURY       FIPS_STATE_CD      UG_LOCATION_CD         UG_LOCATION 
##         "character"           "integer"         "character"         "character" 
## UG_MINING_METHOD_CD    UG_MINING_METHOD     MINING_EQUIP_CD        MINING_EQUIP 
##         "character"         "character"         "character"         "character" 
##        EQUIP_MFR_CD      EQUIP_MFR_NAME      EQUIP_MODEL_NO    SHIFT_BEGIN_TIME 
##         "character"         "character"         "character"           "integer" 
##   CLASSIFICATION_CD      CLASSIFICATION    ACCIDENT_TYPE_CD       ACCIDENT_TYPE 
##         "character"         "character"         "character"         "character" 
##         NO_INJURIES           TOT_EXPER          MINE_EXPER           JOB_EXPER 
##           "integer"           "numeric"           "numeric"           "numeric" 
##       OCCUPATION_CD          OCCUPATION         ACTIVITY_CD            ACTIVITY 
##         "character"         "character"         "character"         "character" 
##    INJURY_SOURCE_CD       INJURY_SOURCE    NATURE_INJURY_CD       NATURE_INJURY 
##         "character"         "character"         "character"         "character" 
##    INJ_BODY_PART_CD       INJ_BODY_PART     SCHEDULE_CHARGE       DAYS_RESTRICT 
##         "character"         "character"           "integer"           "integer" 
##           DAYS_LOST          TRANS_TERM   RETURN_TO_WORK_DT     IMMED_NOTIFY_CD 
##           "integer"         "character"         "character"         "character" 
##        IMMED_NOTIFY     INVEST_BEGIN_DT           NARRATIVE       CLOSED_DOC_NO 
##         "character"         "character"         "character"           "numeric" 
##      COAL_METAL_IND 
##         "character"

The data class is visible through the above function, it shows us the type of observations for each variable.

We will start by segregating relevant column data, we will remove additional/ repeated/ redundant information. The new data frame will be saved into another variable named “mydataw”.

mydataw<-my_dataglob[,-c(7, 56)] #removes document no, closed document columns
#makes dataframe for working
checkyr<-table(mydataw$CAL_YR)
yrdf<-as.data.frame(checkyr)
low<-yrdf[1,1]
high<-yrdf[16, 1]
#high<-max(yrdf$Var1)
range<-paste("Year", low,"to","Year", high, sep=" ")

Lets also check the date range of the data so we are aware from which years is the data from.

cat(range)
## Year 2000 to Year 2015

Lets check for columns that have NA values and also the count of NA values.

nacount<-apply(is.na(mydataw),2,  sum) #get a look at NA in Data, 2 is margin value for columns
dispna<- (nacount>0)
nacount[dispna]
##   EQUIP_MODEL_NO SHIFT_BEGIN_TIME        TOT_EXPER       MINE_EXPER 
##                1                9              360              333 
##        JOB_EXPER  SCHEDULE_CHARGE    DAYS_RESTRICT        DAYS_LOST 
##              328              673              543              422

As we can see that the “NA” are predominantly in experience and lost time injury related columns. We will alter this as per requirement for data visualization.

Lets start by checking how many mine sites have had multiple incidents in this data set.

mincd<-table(mydataw$MINE_ID)# table of mine incidents, mine id with no. of incident
tdf<-as.data.frame(mincd) # temporary dataframe
nmines <- c() #vector count
for (i in 1:max(tdf$Freq)) {
  nmines <- c(nmines, 0)
} #creating a vector based on highest number of incidents reported from mine
a<-max(tdf$Freq)
s<-seq(0,a)
for (i in s){
a<-(tdf$Freq==i)
nmines[i]<-sum(a)
}
nmines<-cbind(nmines, c(1:length(nmines)))
mineincd<-as.data.frame(nmines) #gives a dataframe of no of incidents vs No. of mines
mineincd<-mineincd[mineincd$V2>1,]
plot1<-ggplot(mineincd)+geom_histogram(mapping=aes(x=V2, y=nmines), stat="identity", binwidth=1)+labs(x="No. of Incidents", y="No.of mines") +  annotate("text", x = 25, y = 80, label = paste("Outlier of ","data, mine"," with 26 incidents", sep="\n")) + annotate("text", x=6, y=120, label=paste("Most data is"," concentrated here", "most mines have", "reported 2 incident", sep="\n")) + scale_x_continuous(breaks=c(1,2,3,4,5,6,7,8,9,10,15,20,26))+scale_y_continuous(breaks=c(1, 10, 25, 50,100, 125, 150))

The histogram shows how many mines have had a certain number of incidents more than 1, as almost 730 mines have reported 1 incident at least we have removed that from the graph to make the plot more visually aesthetic:

plot1

We can see that very few mines have reported incidents greater than 10. The one outlier is a mine that has reported 26 incidents.

Now let us check which operators were running the mine site as per permission granted by MSHS with respect to no. of incident reported.

opincd<-table(mydataw$OPERATOR_NAME)
tdf1<-as.data.frame(opincd)#data frame with operator name and no. of incidents

Since most companies have reported one incident, lets look at multiple incidents reported by Operators.

tdf2<-tdf1$Freq>=5
tdf3<-tdf1[tdf2,]
more5plot<-ggplot(tdf3)+geom_histogram(mapping=aes(x=Freq, y=Var1), stat="identity")+scale_x_continuous(breaks= c(5,10,15,20,25,30, 35, 40))+labs(x="No. of Incidents", y="Operator Name")

tdf2<-tdf1$Freq<5 & tdf1$Freq>=2
tdf3<-tdf1[tdf2,]
more2incdplot<-ggplot(tdf3)+geom_histogram(mapping=aes(x=Freq, y=Var1), stat="identity") +labs(x="No. of Incidents", y="Operator Name")

tdf2<-tdf1$Freq<2
tdf3<-tdf1[tdf2,]
row<-length(tdf3$Freq)
colnames(tdf3)<-c("Operator Name", "No.Of Incident")
rownames(tdf3)<-c(1:row)
#plot4<-ggplot(tdf3)+geom_histogram(mapping=aes(x=Freq, y=Var1), stat="identity")

high25df <- tdf1[order( tdf1$Freq, decreasing = TRUE),] 
high25df<-high25df[1:25,]
top25plot<-ggplot(high25df)+geom_histogram(mapping=aes(x=Freq, y=reorder(Var1, Freq)), stat="identity", binwidth=1)+labs(x="No. of Incidents", y="Operator Name", main="Top 20 incidents reported by Operator Name" )

We would like to check the top 25 operators, in terms of incidents reported. This is shown below:

top25plot #top 25 Incidents reported by Operator Name

Alternatively we can check how many operators have reported a certain number of incidents. Since more than 560 operators have reported 1 incident lets check the distribution of operators per incident count greater than 1.

op_incd<-table(mydataw$OPERATOR_NAME)# table of operators names  with no. of incident
tdf4<-as.data.frame(op_incd) # temporary dataframe
nops <- c() #vector count
for (i in 1:max(tdf4$Freq)) {
  nops <- c(nops, 0)
} #creating a vector based on highest number of incidents reported from operator
a<-max(tdf4$Freq)
s<-seq(0,a)
for (i in s){
a<-(tdf4$Freq==i)
nops[i]<-sum(a)
}
nops<-cbind(nops, c(1:length(nops)))
opsincd<-as.data.frame(nops)
opsincd<-opsincd[opsincd$V2>1,]
opcountplot<-ggplot(opsincd)+geom_histogram(mapping= aes(x=V2, y=nops), stat="identity")+labs(x="No. of Incidents", y="No. of Operators")+scale_y_continuous(breaks=c(1,10,25, 50, 100, 200))+scale_x_continuous(breaks=c(2,5,10,15,20, 25, 30, 35))
opcountplot

Next, lets check the frequency of incidents occuring in various areas of the mine.

sub_incd<-table(mydataw$SUBUNIT)# table of operators names  with no. of incident
tdf5<-as.data.frame(sub_incd) # temporary dataframe

areaofmineplot<-ggplot(tdf5)+geom_bar(mapping=aes(x=reorder(Var1,Freq), y= Freq), stat="identity", binwidth=1)+labs(x="Area Subunit", y="No. of Incidents")+ coord_flip()+scale_y_continuous(breaks=c(1, 50, 100, 200, 300, 400, 500, 600, 700))
areaofmineplot

We can ascertain that most incident occured underground, in the open pit area or the mill operation area.

Next, lets check the distribution of incidents over the Fiscal years and then Fiscal Quarters. The bar chart shows the total number of incidents in the year.

fisyr<-table(mydataw$FISCAL_YR)
fisyrdf<-as.data.frame(fisyr)
fisqtr<-table(mydataw$FISCAL_YR, mydataw$FISCAL_QTR)
tdf6<-as.data.frame(fisqtr)
QTR<-tdf6$Var2
totalplot<-ggplot(tdf6)+geom_bar(mapping=aes(x=Var1, y= Freq), stat="identity", binwidth=1)+labs(x="Year", y="No. of Incidents")+scale_y_continuous(breaks=c(0, 25, 50, 75, 100, 125, 150, 175, 200))+theme(legend.position="bottom")
Years<-tdf6$Var1

plot8<-ggplot(tdf6)+geom_density(mapping = aes(x = Freq,))+ labs(x="Year")+theme(legend.position="bottom")

tdf6<-tdf6[tdf6$Freq>0,]
QTRS<-tdf6$Var2

qtrplot1<-ggplot(tdf6)+geom_smooth(mapping = aes(x = Var1, y=Freq, group= Var2, col=QTRS), se=F, method= loess)+ labs(x="Years", y="Number of Incidents")+theme(legend.position=c(0.8,0.8)) #+geom_point(mapping = aes(x = Var1, y=Freq, group= Var2, col=QTRS))
totalplot

The line chart shows the distribution of incidents within every quarter over FY2000 to FY2015.

qtrplot1

We can see that over the years the no. of incidents has been gradually decreasing for every quarter and are following the yearly trend from the bar chart.

Moving on, we can identify the degree of injury due to all these incidents. This will help identify what are most common injuries as a result of these incidents.

deginj<-table(mydataw$DEGREE_INJURY)
tdf7<-as.data.frame(deginj)

ggplot(tdf7)+geom_histogram(mapping=aes(x=Freq, y=reorder(Var1, Freq)), stat="Identity")+labs(x="No. of Incidents", y="Degree of Injury")

The degree of injury tells us the nature of the injury based on the consequence, for instance, most incidents resulted in injuries that caused the worker to stay away from work for a number of days.

tdf8<-as.data.frame(table((mydataw$FIPS_STATE_CD)))#state locations
xx<-fips_info()$fips #get state code fips vector
yy<-setdiff(xx, tdf8$Var1) #check missing states between data and total stataes
yy<-yy[-c(2:7)]
yy<-cbind(yy,0)#make 2 columns for missing states
yy<-as.data.frame(yy)# make dataframe for missing state
colnames(yy)<-c("Var1", "Freq") #label columns for missing states
tdf8<-rbind(tdf8,yy) #add to data
aa<-c(tdf8$Var1)
aa<-as.numeric(as.character(aa)) #change fips code value in data from 1 to 01 for matching, required for left join function
bb<-fips_info(aa)
colnames(tdf8)<-c("fips", "Freq" )
tdf8[] <- lapply(tdf8, as.character)
tdf8$fips[1:6]<-c("01",'04', "05", "06", "08", "09")
tdf8<-left_join(tdf8, bb, by="fips")


colnames(tdf8)[4]<-"region"
tdf8$region<-tolower(tdf8$region)# return lower case names from uppercase
usstates<-map_data("state") #map data
mapdata<-left_join(tdf8, usstates)


m1<-ggplot(data = mapdata) + geom_polygon(aes(x=long, y=lat, group=group, fill=region) ) + coord_equal() + geom_path(aes(x = long, y = lat, group=group), color="black", size=0.3) + coord_map(projection = "albers", lat0 = 39, lat1 = 45)+theme(legend.position="bottom") #map plot of us states by region
  

plot10<-m1+geom_text(
    data = mapdata %>%
      group_by(group) %>%
      summarise(Lat = mean(c(max(lat), min(lat))),
                Long = mean(c(max(long), min(long)))) %>%
      mutate(state = group) %>%
      left_join(mapdata, by = c("state" = "group")),
    aes(x = Long, y = Lat, label = mapdata$Freq ), size=2.8
    ) #+coord_flip()
# added incident number as text to plot, clearer in interactive map, remove hashtag before ggplotly(plot10) command to view, is heavy and may take time to load

The map below shows us the state count of number of incidents over the years we can see that some states either have no incidents due to missing data or because the mining industry is non-existent over there. Similarly, we can identify states where most incidents occurred and check if they have proper legislation and safety standards in place.

plot10

Now, let us take a look at incidents that took place in an underground mining area and see which areas are more prone to these incidents.

ugloc<-table(mydataw$UG_LOCATION)
tdf9<-as.data.frame(ugloc)

A quick look in the above dataframe tells us the Data has a Variable “No Value Found” which depicts the incident was not underground as we can see from the area subunit graph that although the highest no. of incident occurred underground, they were not all the incidents. So, we can remove this variable to view our data here. Using cleaning techniques we get the below graph.

tdf9<-tdf9[!tdf9$Var1=="NO VALUE FOUND",]# data cleaning


uglocplot<-ggplot(tdf9)+geom_histogram(aes(x=Freq, y=reorder(Var1,Freq)), stat="identity", binwidth=1)+scale_x_continuous(breaks=c(20,50,70,100, 120,150, 170, 200, 220))+labs(title="Underground Incident Area with Incidents Number", x="No. of Incidents", y="Underground Locations")
uglocplot

Let us also look at the Underground Mining Method being used, lets try to see it along with the previous plot of underground location. This will show us which mining method was being used when these incidents occurred in these locations. the Frequency here depicts the number of incident counts.

ugmeth<-table(mydataw$UG_MINING_METHOD, mydataw$UG_LOCATION)
tdf10<-as.data.frame(ugmeth)
tdf10<-tdf10[!tdf10$Var1=="NO VALUE FOUND",]

ugloc_mm_plot<-ggplot(tdf10, mapping = aes(x = Var2, y = Var1)) + 
  geom_count(mapping = aes(size=Freq, color=Freq)) +
  ggtitle("UG Location and Mining Methods vs No. of Incidents")+ theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))+labs(x="UG Locations of Incidents", y="UG Miining Method of Incidents")
  ugloc_mm_plot

Now lets look at some of the mining equipment in use when these incidents occurred, since there is too many data instances lets look at the top 20 equipment in use when incidents occurred.

minequip<-table(mydataw$MINING_EQUIP)
tdf11<-as.data.frame(minequip)
tdf11<-tdf11[!tdf11$Freq==0 & !tdf11$Var1=="NO VALUE FOUND",]

top20eqdf <- tdf11[order( tdf11$Freq, decreasing = TRUE),] 
top20eqdf<-top20eqdf[1:20,]


top20eq_plot<-ggplot(top20eqdf)+geom_histogram(aes(x=Freq, y=reorder(Var1, Freq)), stat="identity")+scale_x_continuous(breaks=c(1, 10, 25, 50, 75, 100, 125, 150, 175, 200, 225))+labs(x="No. of Incidents", y="Mining Equipment")
top20eq_plot

Now lets take a look at the equipment in use when incident occurred along with the data of the equipment manufacturer.

This is an interactive plot so please view frequency of incidents by hovering over count shapes.

The trend here shows that most of the equipment in use came from manufacturers that were either “Not Listed”, “Not on this List, or”Not reported" which mean s that there is high possibility that these equipments were not manufactured by qualified manufacturers as per approved standards.

minmfr<-table(mydataw$EQUIP_MFR_NAME, mydataw$MINING_EQUIP)
tdf12<-as.data.frame(minmfr)
tdf12<-tdf12[!tdf12$Var1=="NO VALUE FOUND" & !tdf12$Var2=="NO VALUE FOUND" & !tdf12$Freq==0,]

mfr_plot<-ggplot(tdf12, mapping = aes(x = Var1, y = Var2)) + 
  geom_count(mapping = aes(size=Freq, color=Freq)) +
  ggtitle("Mining Equipment & Manufacturer vs No. of Incidents")+ theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))+labs(x="Mining Equipment", y="Equip Manufacturer")
ggplotly(mfr_plot)

We can see that most of the equipment involved in accidents belongs to manufacturers that are either not on the list or have not been reported by the company which would mean that these were not upto the industry standard.

Next we can see the type of accident and nature of accident expressed with the number of incidents. Again, this is an interactive graph to provide better visualization of data.

classify<-table(mydataw$CLASSIFICATION, mydataw$ACCIDENT_TYPE)
tdf13<-as.data.frame(classify)
tdf13<-tdf13[!tdf13$Var1=="NO VALUE FOUND" & !tdf13$Var2=="NO VALUE FOUND" & !tdf13$Freq==0,]


plot14<-ggplot(tdf13, mapping = aes(x = Var2, y = Var1)) + 
  geom_count(mapping = aes(size=Freq, color=Freq)) +
  ggtitle("Classification of Accident and Accident type vs No. of Incidents")+ theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))+labs(x="Accident type", y="Classification of Accident (Circumstances leading to Accident)")
ggplotly(plot14)

Next we will see the number of incidents with respect to the occupation of those involved in these incidents, telling us which jobs are more prone to incidents. As it is a massive dat set, lets review the top 20 occupations involved in incidents.

occup<-table(mydataw$OCCUPATION)
tdf14<-as.data.frame(occup)
tdf14<-tdf14[!tdf14$Var1=="NO VALUE FOUND" & !tdf14$Freq==0,]

top20occdf <- tdf14[order( tdf14$Freq, decreasing = TRUE),] 
top20occdf<-top20occdf[1:20,]


plot15<-ggplot(top20occdf)+geom_histogram(aes(x=Freq, y=reorder(Var1, Freq)), stat="identity")+labs(x="No. of Incidents", y="Occupation", title="Occupation vs No. of Incidents")+scale_x_continuous(breaks=c(1,10,50,100, 150, 200, 250, 300, 350, 400))
plot15

We can see that the maintenance and repair team were involved in most inncidents which makes sense as they are involved directly in plant maintenance and operation, exposing them more to these incidents.

If we would like to look at the activities that these workers were performing when these incidents occurred we can look at another plot, however again we will only see the top 15 activities.

activ<-table(mydataw$ACTIVITY)
tdf15<-as.data.frame(activ)
tdf15<-tdf15[!tdf15$Var1=="NO VALUE FOUND",]

top15actdf <- tdf15[order( tdf15$Freq, decreasing = TRUE),] 
top15actdf<-top15actdf[1:15,]


plot16<-ggplot(top15actdf)+geom_bar(aes(x=Freq, y=reorder(Var1, Freq)),stat="identity")+labs(x="No. of Incidents", y="Activity being Performed", title="Activity vs No. of Incidents")+scale_x_continuous(breaks=c(1, 10, 25,50,100,150,200,250,300))
plot16

We can see that those handling materials and supplies or involved with machine tools werethe ones most involved in incidents.

Moving on, lets see the source of injuries. Lets plot the data for the top 10 sources of injuries in these incidents.

injury<-table(mydataw$INJURY_SOURCE)
tdf16<-as.data.frame(injury)
tdf16<-tdf16[!tdf16$Var1=="NO VALUE FOUND",] #cleaning



top10injdf <- tdf16[order( tdf16$Freq, decreasing = TRUE),] 
top10injdf<-top10injdf[1:10,] #top 10


injury_plot<-ggplot(top10injdf)+geom_bar(aes(x=Freq, y=reorder(Var1, Freq)),stat="identity")+labs(x="No. of Incidents", y="Source of Injury", title="Injury Source vs No. of Incidents")+scale_x_continuous(breaks=c(1, 10, 25,50,100,150,200,250,300))
injury_plot

We see that most common source of injury was metal pipe or wire lying around or being used leading to an accident resulting in injury.

Lets view the nature of injury data to see the extent of the injury, again, lets review only the top 10 results.

natinj<-table(mydataw$NATURE_INJURY)
tdf17<-as.data.frame(natinj)
tdf17<-tdf17[!tdf17$Var1=="NO VALUE FOUND",] #cleaning



top10nat_injdf <- tdf17[order( tdf17$Freq, decreasing = TRUE),] 
top10nat_injdf<-top10nat_injdf[1:10,] #top 10


natinj_plot<-ggplot(top10nat_injdf)+geom_bar(aes(x=Freq, y=reorder(Var1, Freq)),stat="identity")+labs(x="No. of Incidents", y="Nature of Injury", title="Nature of Injury vs No. of Incidents")+scale_x_continuous(breaks=c( 10, 50,100,150,200,250,300, 400, 500, 550))
natinj_plot

We can see above that the most common nature of the injury was sprain, strain followed by cuts and wounds. This can help us plan first aid response as this shows the most likely nature of injury in mining incidents.

Finally, let us look in which form of mining industries most incidents occurred. This will help us plan which industry needs most urgent policy reforms in terms of safety standards.

ind_check<-table(mydataw$COAL_METAL_IND)
tdf18<-as.data.frame(ind_check)
tdf18<-tdf18[!tdf18$Var1=="NO VALUE FOUND",] #cleaning






ind_plot<-ggplot(tdf18)+geom_histogram(aes(x=Var1, y= Freq),stat="identity", binwidth=1)+labs(x="Type of Industry", y="Number of Incidents", title="Industry Type vs No. of Incidents")+scale_y_continuous(breaks=c( 0, 250, 500, 750, 1000, 1250))+ theme(aspect.ratio = 2/1)
ind_plot

As we can see the metal mining industry has reported more incidents overall than the coal industry, but considering that metal mining may include several different metal being mined we have to see coal as the single most common source of incidents in the mining industry and we need to implement better safety codes in the industry to prevent these incidents, which we have observed have reduced over the years.